## Read in data
datadir = "/home/shyun/Dropbox/research/usc/ocean-provinces/omd/data"
filenames = c("tabulated_darwin_montly_clim_045_090_ver_0_2.csv",
              "tabulated_darwin_montly_clim_090_180_ver_0_2.csv",
              "tabulated_darwin_montly_clim_180_360_ver_0_2.csv",
              "tabulated_darwin_montly_clim_360_720_ver_0_2.csv",
              "tabulated_geospatial_montly_clim_045_090_ver_0_2_5.csv",
              "tabulated_geospatial_montly_clim_090_180_ver_0_2_5.csv",
              "tabulated_geospatial_montly_clim_180_360_ver_0_2_5.csv",
              "tabulated_geospatial_montly_clim_360_720_ver_0_2_5.csv",
              "tabulated_geospatial_montly_clim_045_090_ver_0_2.csv",
              "tabulated_geospatial_montly_clim_090_180_ver_0_2.csv",
              "tabulated_geospatial_montly_clim_180_360_ver_0_2.csv",
              "tabulated_geospatial_montly_clim_360_720_ver_0_2.csv")

Earthmover’s distance

Earthmover’s distance (EMD) is defined as a function of the optimal flow \(F = [f_ij]\) and the ground distance

\[E M D(P, Q)=\frac{\sum_{i=1}^{m} \sum_{j=1}^{n} d_{i j} f_{i j}}{\sum_{i=1}^{m} \sum_{j=1}^{n} f_{i j}}\]

(Taken directly from Orlova et al. https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0151859)

New day, new box

Here is the new, (yellow) box that we will examine.

dat = read.csv(file.path(datadir, filenames[1]))
mydat = dat[which(dat[,"month"]==1),]
colfun = colorRampPalette(c("blue", "red"))
par(mar=c(0,0,0,0))
map("world", fill=TRUE, col="white", bg="white", ylim=c(-90, 90), mar=c(0,0,0,0))
## plot(newmap)##, xlim = c(-20, 59), ylim = c(35, 71), asp = 1)
centers = sample(mydat$Chl, 5)
points(mydat[ ,c("lon", "lat")], pch=15, cex=4,
       col=colfun(20)[as.numeric(cut(mydat$Chl,breaks = 20))])
map("world", fill=TRUE, col="white", bg="white", ylim=c(-90, 90), mar=c(0,0,0,0), add=TRUE)
## New box 
lonrange = c(-180, -135)
latrange = c(20, 45)
rect(lonrange[1], latrange[1], lonrange[2], latrange[2],
     border = c("yellow"), lwd=5)

Darwin data (Jan vs Feb)

First, visualize the January and February chlorophyll data (Darwin).

Values are all normalized to sum to 1.

dat = read.csv(file.path(datadir, filenames[2]))
## Jan
mydat = dat[which(dat[,"month"]==1),]
jan.dat = mydat[which(lonrange[1] < mydat[,"lon"] & mydat[, "lon"] < lonrange[2] &
                      latrange[1] < mydat[,"lat"] & mydat[, "lat"] < latrange[2]),]
jan.mat = make_mat(jan.dat)
jan.mat = jan.mat/sum(jan.mat)
drawmat_precise2(jan.mat, main="Jan, Darwin")

## Feb
mydat = dat[which(dat[,"month"]==2),]
feb.dat = mydat[which(lonrange[1] < mydat[,"lon"] & mydat[, "lon"] < lonrange[2] &
                      latrange[1] < mydat[,"lat"] & mydat[, "lat"] < latrange[2]),]
feb.mat = make_mat(feb.dat)
feb.mat = feb.mat/sum(feb.mat)
drawmat_precise2(feb.mat, main="Feb, Darwin")

Now, we calculate optimal transport and make a “vector map” of mass movement.

## Get optimal transport 
res <- transport(pgrid(jan.mat), pgrid(feb.mat), p=2,
                 method="aha")

## Plot both
plot(pgrid(jan.mat), pgrid(feb.mat), res,
     ## Style options
     mass = "thickness", acol = rgb(0,0,1,0.5),
     rot = TRUE)
title(main="Jan to Feb, Darwin")

Now, we highlight these arrows for four ranges (0-25%, 25%-50%, 50%-75%, 75%-100%) of the magnitude of the mass transfer.

## Separate them by magnitude of the mass transfer
nbin = 4

## cutoffs = seq(from = min(res$mass), to = max(res$mass), length = nbin+1)
cutoffs = quantile(res$mass)

par(mfrow=c(2,2))
for(ii in 1:nbin){
  massrange = cutoffs[ii:(ii+1)]
  ## cols = rep(rgb(0,0,1,0.2), length(res$mass))
  cols = rep(NA, length(res$mass))
  cols[which(massrange[1] < res$mass & res$mass < massrange[2])] = rgb(1,0,0, 0.7)
  plot(pgrid(jan.mat), pgrid(feb.mat), res,
       ## Style options
       mass="thickness", acol=cols,
       rot = TRUE)
  title(main = paste("Magnitude", c("Lowest", "Second to lowest",
                                    "Second to  highest", "Highest")[ii]),
        cex.main = 3)
}

From the optimal transport, we directly know the scalar earthmovers distance, by the formula:

\[E M D(P, Q)=\frac{\sum_{i=1}^{m} \sum_{j=1}^{n} d_{i j} f_{i j}}{\sum_{i=1}^{m} \sum_{j=1}^{n} f_{i j}}\]

Darwin (between adjacent months)

all.mats = list()
all.pmats = list()
for(ii in 1:12){
  ## One month's data
  mydat = dat[which(dat[,"month"]==ii),]
  one.dat = mydat[which(lonrange[1] < mydat[,"lon"] & mydat[, "lon"] < lonrange[2] &
                     latrange[1] < mydat[,"lat"] & mydat[, "lat"] < latrange[2]),]
  one.mat = make_mat(one.dat)
  one.mat = one.mat/sum(one.mat)
  all.mats[[ii]] = one.mat
  all.pmats[[ii]] = pgrid(one.mat)
}

Now, we obtain all twelve months’ Darwin data and visualize optimal transports in adjacent months (Jan to Feb, Feb to March, and so on). Note, I highlighed the top 10% high-mass transfers in red.

par(mfrow=c(6,2))
for(ii in 1:11){
  pgrid1 = pgrid(all.mats[[ii]])
  pgrid2 = pgrid(all.mats[[ii+1]])
  res <- transport(pgrid1, pgrid2, p=2,
                   method="aha")


  ## nbin = 2
  ## cutoffs = seq(from = min(res$mass), to = max(res$mass), length = nbin+1)
  cutoffs = quantile(res$mass, probs = c(0.9,1))
  jj = 1
  massrange = cutoffs[jj:(jj+1)]
  cols = rep(rgb(0,0,1,0.2), length(res$mass))
  cols[which(massrange[1] < res$mass & res$mass < massrange[2])] = rgb(1,0,0,0.7)

  plot(pgrid1, pgrid2, res,
       ## Style options
       mass="thickness", acol=cols,
       ## lwd=0.1,
       rot=TRUE, length=0.2)

  title(main = paste0( month.abb[ii], " to ", month.abb[ii+1]), cex.main=2)
}

Earth Mover’s Distance Matrix

Now, let’s form a 24 x 24 distance matrix, which encodes all the pairwise earth mover’s distances between all months in real data and Darwin data.

## DARWIN
all.pmats.darwin = list()
dat = read.csv(file.path(datadir, filenames[2]))
nmonth = 12
for(ii in 1:nmonth){
  ## One month's data
  mydat = dat[which(dat[,"month"]==ii),]
  one.dat = mydat[which(lonrange[1] <= mydat[,"lon"] & mydat[, "lon"] <= lonrange[2] &
                        latrange[1] <= mydat[,"lat"] & mydat[, "lat"] <= latrange[2]),]
  one.mat = make_mat(one.dat)
  ## one.mat = one.mat[-nrow(one.mat),]
  one.mat = one.mat/sum(one.mat, na.rm=TRUE)
  one.mat[which(is.na(one.mat))] = 0
  all.pmats.darwin[[ii]] = pgrid(one.mat)
}

## REAL
dat = read.csv(file.path(datadir, filenames[6])) ## real
all.pmats.real = list()
for(ii in 1:nmonth){
  ## if(ii==1) jj=2 ## replace this...................
  ## if(ii==12) jj=11
  ## if(ii %ni% c(1,12)) jj=ii

  ## One month's data
  mydat = dat[which(dat[,"month"]==ii),]
  one.dat = mydat[which(lonrange[1] <= mydat[,"lon"] & mydat[, "lon"] <= lonrange[2] &
                        latrange[1] <= mydat[,"lat"] & mydat[, "lat"] <= latrange[2]),]
  one.mat = make_mat(one.dat)
  ## if(ii!=12) one.mat = one.mat[-nrow(one.mat),]
  one.mat = one.mat/sum(one.mat, na.rm=TRUE)
  ## one.mat[which(is.na(one.mat))] = 0
  all.pmats.real[[ii]] = pgrid(one.mat)
}

all.pmats = c(all.pmats.real, all.pmats.darwin)
nmonth = 24
distmat = matrix(NA, nmonth, nmonth)
## mclapply(1:(nmonth)^2, function(imonth){
##   ii = imonth %% 12 + 1
##   jj = imonth - 12 * (ii-1)
for(ii in 1:nmonth){
  for(jj in 1:nmonth){
    if(ii>jj){
      res <- transport(all.pmats[[ii]], all.pmats[[jj]], p=2, method="aha")
      dist = wasserstein(all.pmats[[ii]], all.pmats[[jj]], tplan=res)
      distmat[ii,jj] = dist
      distmat[jj,ii] = dist
    }
  }
} 
## }, mc.cores=8)

diag(distmat) = 0 
diag(distmat) = NA
colnames(distmat) = rownames(distmat) = c(paste0("r", 1:(nmonth/2)),
                                          paste0("d", 1:(nmonth/2)))
## image(Matrix(distmat))  
library(lattice)
drawmat_precise2(distmat, contour=FALSE, main="All months' EMD from Real & Darwin data",
                 panel = function(...){
                   panel.levelplot(...)
                   lattice::panel.abline(v = 12.5, lwd=3)
                   lattice::panel.abline(h = 12.5, lwd=3)
                 })

## abline(v=13, lwd=5)
## knitr::kable(as.data.frame(signif(distmat,3)))

We can take a look at certain distances; from left to right in the boxplot.

  1. Between Darwin and Real.
  2. Among Darwin data.
  3. Among Real data.
  4. Among Darwin data, adjacent months.
  5. Among Real data, adjacent months.

Compared to the larger white box before we can see that the distances between Darwin data and real data (left-most box) is even more separated from the rest of the boxplots.

diag(distmat) = 0 
## Extract certain things
pairdists = sapply(1:12, function(ii){
  distmat[ii, ii+12]
})
distmat.real = distmat[1:12, 1:12]
distmat.darwin = distmat[13:24, 13:24]

## Combine them into a list
dists = list(pair= pairdists,
             darwin = distmat.darwin[lower.tri(distmat.darwin)],
             real = distmat.real[lower.tri(distmat.real)],
             darwin_serial = unlist(diag(distmat.darwin[-1,-ncol(distmat.darwin)])),
             real_serial = unlist(diag(distmat.real[-1,-ncol(distmat.real)])))

## Clean
dists = lapply(dists, function(a){
  if(any(is.na(a))){
    return(a[-which(is.na(a))])
  } else {
    return(a)
  }
})

names(dists)=c("darwin-to-real","darwin", "real", "darwin m2m", "real m2m")
boxplot(dists,
        col=c(rgb(0,0,1,0.2), rep(rgb(1,0,0,0.2),2), rep(rgb(0,1,0,0.2),2)),
        main="Distributions of Earth mover's distances", cex.main=2, cex.lab=2)

Clustering and Multidimensional Scaling

From a distance matrix containing all pairwise distances, we can produce more interesting analyses and visualizations.

Hierarchical clustering from the pairwise distances: at the beginning of the process, each element is in a cluster of its own. The clusters are then sequentially combined into larger clusters until all elements end up being in the same cluster.

colnames(distmat) = rownames(distmat) = c(paste0("real-", 1:(nmonth/2)),
                                          paste0("darwin-", 1:(nmonth/2)))
distmat <- as.dist(distmat, diag = TRUE)

labelCol <- function(x) {
  if (is.leaf(x)) {
    ## fetch label
    label <- attr(x, "label") 
    ## set label color to red for A and B, to blue otherwise
    attr(x, "nodePar") <- list(lab.col=ifelse(sapply(label, substr, 1,4) == "real", "blue", "red"))
  }
  return(x)
}

## plot(hclust(distmat), axes=FALSE)
hc = hclust(distmat)
d <- dendrapply(as.dendrogram(hc), labelCol)
plot(d, axes=FALSE)

Multidimensional scaling (MDS) is a dimension reduction technique to translate + visualize the level of similarity of \(n\) individual cases of a dataset. MDS translates pairwise ‘distances’ among a set of \(n=24\) objects into a configuration of \(n=24\) points mapped into a lower-dimensional Cartesianspace (e.g. 2D) while preserving the pairwise distances.

## Multidimensional scaling
fit <- cmdscale(distmat, eig=TRUE, k=2) # k is the number of dim

# plot solution
x <- fit$points[,1]
y <- fit$points[,2]
par(pty="s")
plot(x, y, xlab="Coordinate 1", ylab="Coordinate 2",
     main="Metric MDS", type="n")
text(x, y, labels = c(paste0(1:(nmonth/2)),
                      paste0(1:(nmonth/2))),
     col =  c(rep("blue", nmonth/2), rep("red", nmonth/2)),
     cex=2)
legend("topright", fill=c('blue', 'red'), legend = c("real", "Darwin"))

Each data source is closer to each other, more so than the same month from the two data sources.

Some subtleties

What to do for optimal transport in the presence of land mass to obstruct movement (water can’t cross a peninsula)? One can simply modify the distances so that, the Euclidean distance is the default, but whenever the arrow meets a land mass, the distance is positive infinity so that it incurs an infinite cost for any mass to cross a landmass. (If it really needs to, it will go around the coast.)

Also, the Wasserstein’s distance metric does not seem to be comparable between different size objects. In other words,